load("/data/nycdoe/school_info_with_lat_long.Rdata")

#loda bio data 
load("/data/nycdoe/clean_data/common_data_frames/bio_data.Rdata")

load("/data/nycdoe/clean_data/common_data_frames/zoneData.Rdata")

boro_zip <- read.csv("/data/nycdoe/nyc_zip_to_borough.csv")

all_bio_data <- bio_data

zone_data <- zoneData

join boro zips to county (another way to detirming boro)

county <- c("005", "047", "061", "081", "085")
boro <- c("Bronx", "Brooklyn", "Manhattan", "Queens", "Staten")

county_bor <- data.frame(county, boro)

boro_zip_join <- full_join(boro_zip, county_bor, by = c("borough" = "boro"))

save(boro_zip_join, file = "/data/nycdoe/clean_data/boro_zip_join.Rdata")

load and select from zone data

zoneData <- zone_data %>% select(c(1,2,3,4,5,7,8))

nrow(zoneData) #9502347 rows in zoneData
## [1] 9502347
#zoneData %>% group_by(year)  %>% filter(is.na(res_zip_cde)) #23891 rowa

Join on bio on dbn and year

bio_data <- bio_data %>% select(student_id_scram, year, dbn, grade_level)

join_zone_bio <- inner_join(bio_data, zoneData, by = c("student_id_scram", "year"))

join_zone_bio %>% filter(is.na(res_zip_cde) & is.na(census_tract))#21,780
## # A tibble: 21,780 x 9
##    student_id_scram  year    dbn grade_level res_zip_cde zoned_elm_dbn
##               <int> <dbl>  <chr>       <chr>       <chr>         <chr>
##  1        701002241  2007 08X405          09        <NA>          <NA>
##  2        270002473  2007 13K336          06        <NA>          <NA>
##  3        602002359  2007 31R044          04        <NA>          <NA>
##  4        924102651  2007 11X178          02        <NA>          <NA>
##  5         38102775  2007 08X069          01        <NA>          <NA>
##  6        439202065  2007 02M047          07        <NA>          <NA>
##  7        687202456  2007 19K328          05        <NA>          <NA>
##  8        746202007  2007 21K281          06        <NA>          <NA>
##  9        920202898  2007 75X723          04        <NA>          <NA>
## 10        901202809  2007 17K353          06        <NA>          <NA>
## # ... with 21,770 more rows, and 3 more variables: zoned_mid_dbn <chr>,
## #   zoned_hs_dbn <chr>, census_tract <chr>
nrow(join_zone_bio) # 9465495  rows
## [1] 9465495
#  with_grade <- all_bios %>% select(student_id_scram, year, dbn, grade_level)
#  join_zone_bio_with_grade <- inner_join(with_grade, zoneData, by = c("student_id_scram", "year"))
# save(join_zone_bio_with_grade, file = "/data/nycdoe/clean_data/join_zone_bio_grade.Rdata")

clean data

student_go_to_schools <- join_zone_bio %>% select(c(1,2,3,4,5,9))
student_go_to_schools <- student_go_to_schools %>% mutate(res_zip_cde = as.integer(res_zip_cde))
## Warning in evalq(as.integer(res_zip_cde), <environment>): NAs introduced by
## coercion
#join to get county
student_go_to_schools_join_county <- full_join(student_go_to_schools, boro_zip_join, by = c("res_zip_cde" = "zipcode"))

student_go_to_schools_join_county <- student_go_to_schools_join_county %>% group_by(year, dbn, census_tract, county) %>% summarise(numStudents = n())

#filter na values
na_check <- student_go_to_schools_join_county %>% filter(is.na(census_tract) | is.na(county))

nrow(na_check)/nrow(student_go_to_schools_join_county)
## [1] 0.01311077
#dropping  0.01311077 (%) where either county or tract is na (county is na when there was no zip code)

student_go_to_schools_join_county  <- student_go_to_schools_join_county %>% filter(!is.na(census_tract) | !is.na(county))

pad census tract with 6 zeros and put back to char —same type as shapefile

#coerce to int
student_go_to_schools_join_county  <- student_go_to_schools_join_county %>% ungroup()  %>% mutate(census_tract = as.integer(census_tract))
## Warning in evalq(as.integer(census_tract), <environment>): NAs introduced
## by coercion
#pad with 0's
student_go_to_schools_join_county  <- student_go_to_schools_join_county  %>% mutate(census_tract =  sprintf("%06d", census_tract))

#put back to char
student_go_to_schools_join_county<- student_go_to_schools_join_county  %>% mutate(census_tract = as.character(census_tract))


#Check what was dropped ???? Are zips listed in every year???? yes there are zips every year, a negligible amount were dropped (they had no data at all)

join with lat long on dbn

student_go_to_schools_join_county_join_latLong <- inner_join(student_go_to_schools_join_county, lat_long, by = "dbn")

student_go_to_schools_join_county_join_latLong_join_name <- inner_join(student_go_to_schools_join_county_join_latLong, school_info, by = c("dbn" = "ATS System Code"))

Map the dbn

#load tract data
load('/data/nycdoe/nyc_tracts.Rdata')


#filter data to one year, one dbn
sample_map_data <- student_go_to_schools_join_county_join_latLong_join_name %>% filter(dbn =='02M475' & year ==2009)

#add the data to the sapcial file 
tracts_map <- merge(nyc_tracts, sample_map_data, by.x = c("TRACTCE","COUNTYFP"), by.y =c("census_tract", "county"))

#design color palette
pal3 <- colorNumeric(palette = "Reds",
                    domain = range(tracts_map@data$numStudents, na.rm=T), na.color = "#cccccc")


#nice school icon
school_icon <- makeIcon(
 iconUrl = "http://www.freeiconspng.com/uploads/high-school-icon-png-8.png",
 iconWidth = 38, iconHeight = 38)


#############################################
#get schools zones shapefile (THIS IS ELEMENTARY SCHOOLS!!!!)
r<- GET("https://data.cityofnewyork.us/api/geospatial/cq6p-iwiy?method=export&format=GeoJSON")
dbns <- readOGR(content(r,'text'), 'OGRGeoJSON', verbose = F)
## No encoding supplied: defaulting to UTF-8.
## Warning in readOGR(content(r, "text"), "OGRGeoJSON", verbose = F): Dropping
## null geometries: 752
###############################################


#use leaflet to map data
leaflet(tracts_map) %>%
  #addTiles() %>% 
   addPolygons(data = dbns, color = "black") %>%
  addPolygons(weight = .5, fillColor = ~pal3(numStudents), popup = ~paste("Number of Students:", numStudents, "\nCensus Tract:", TRACTCE), fillOpacity = .7) %>% 
  addLegend(pal = pal3, values = ~numStudents, opacity = 1) %>%
   addMarkers(~lon, ~lat, icon = school_icon) %>%
  addProviderTiles("CartoDB.Positron") %>%
  setView(-73.98, 40.75, zoom = 13)
## Warning in validateCoords(lng, lat, funcName): Data contains 1275 rows with
## either missing or invalid lat/lon values and will be ignored

Make a function to do this for you

#function take dbn and make map
getMapFromDbn <- function(mydbn, myyear){
  sample_map_data <- student_go_to_schools_join_county %>% filter(dbn == mydbn & year == myyear)
  tracts_map <- merge(nyc_tracts, sample_map_data, by.x = c("TRACTCE","COUNTYFP"), by.y =c("census_tract", "county"))
  
  pal3 <- colorNumeric(palette = "Reds",
                    domain = range(tracts_map@data$numStudents, na.rm=T), na.color = "#cccccc")

leaflet(tracts_map) %>%
  #addTiles() %>% 
  addPolygons(weight = .5, fillColor = ~pal3(numStudents), popup = ~paste("Number of Students:", numStudents, "\nCensus Tract:", TRACTCE), fillOpacity = .7) %>% 
  addLegend(pal = pal3, values = ~numStudents, opacity = 1) %>%
  addProviderTiles("CartoDB.Positron") %>%
  setView(-73.98, 40.75, zoom = 13)
}


getMapFromDbn("10X368", 2008)
save(student_go_to_schools_join_county, file = "/data/nycdoe/clean_data/students_go_to_school_join_county.Rdata")